Starbucks management wants to know who our best and worst customers are and the effect of offers on them.
The following problems are expected to be solved:
This analysis will help Starbucks review its marketing strategies and provides a platform to come up with better offers for low-volume customers and a renewed strategy to retain the high-volume, high-frequency regular customers.
# Import libraries
import pandas as pd
import numpy as np
import math
import json
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from scipy import stats
from scipy import interpolate
import re
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import StandardScaler
from sklearn.utils import check_random_state
from sklearn.decomposition import PCA
# --- Notebook display configuration --------------------------------------
# Set image parameters for the notebook
plt.rcParams['figure.figsize'] = [5, 5]
sns.set_style("white")
# Set a color palette
# Brand-inspired colors: Starbucks green, coffee browns, plus gold/black accents.
starbucks_palette = ["#00704A", "#362415", "#eac784", "#604c4c",'gold','black']
# Remove column display limits
pd.set_option('display.max_columns', None)
# pd.set_option('display.height', None)
pd.set_option('display.max_rows', None)
# pd.set_option('display.width', None)
Read Non-transformed merged data and prepare the data for Clustering . Remove columns like time, offer_id
# Read clean data
# Pre-merged, non-standardized offer/transaction data keyed by customer_id.
starbucks = pd.read_csv('starbucks.csv',index_col='customer_id')
# Print
starbucks.head()
starbucks.shape
# Keep only rows where the customer actually received an offer.
starbucks_drop = starbucks[starbucks['offer_received'] == 1]
# Drop identifier/time/constant columns that are not useful for clustering.
starbucks_final = starbucks_drop.drop([ 'Unnamed: 0', 'offer_id','received_time','viewed_time','transaction_time','completed_time','time_viewed_received','time_completed_received','time_completed_viewed','email','offer_received','transaction'], axis = 1)
starbucks_final.dtypes
starbucks_final.shape
# Sanity-check the class balance of the binary columns.
starbucks_final.offer_completed.value_counts(),starbucks_final.bogo.value_counts(),starbucks_final.discount.value_counts(),starbucks_final.informational.value_counts(),starbucks_final.difficulty.value_counts()
This is to identify highly correlated and less significant features. We check whether we still have any low-variance features using the VarianceThreshold feature selector, treating the features as Bernoulli variables to choose the threshold.
We want to remove all features that have the same value in more than 90% of the samples.
Since we have already taken care of highly correlated features in our initial Data Modelling and Feature Selection, we do not have any low-variance features to remove.
from sklearn.feature_selection import VarianceThreshold
# For a Bernoulli variable Var[X] = p(1-p); with p = 0.9 this removes
# features that take the same value in more than 90% of samples.
selector = VarianceThreshold(threshold=(.9 * (1 - .9)))
new = selector.fit_transform(starbucks_final)
new_inverse =selector.inverse_transform(new)
# Same column count as the input confirms no low-variance feature was dropped.
new.shape
def color_correlation(val):
    """
    Return a CSS color string used to style a correlation matrix cell.

    Parameters
    ----------
    val: A numeric correlation coefficient.

    Returns
    -------
    'color: black' when -0.75 < val < 0.75, 'color: red' otherwise —
    red flags strongly (positively or negatively) correlated pairs.
    (The original docstring wrongly said red was for negative values.)
    """
    color = 'black' if -0.75 < val < 0.75 else 'red'
    return 'color: %s' % color
# Styled correlation matrix: strong correlations (|r| >= 0.75) shown in red.
starbucks_final.corr().style.applymap(color_correlation)
# Diverging heatmap of the same correlation matrix.
fig, ax = plt.subplots(figsize=(16,8))
sns.heatmap(starbucks_final.corr(), vmin=-1, vmax=1, center=0,
            cmap=sns.diverging_palette(20, 220, n=200),
            square=True)
# Rotate x labels so long feature names stay readable.
ax.set_xticklabels(
    ax.get_xticklabels(),
    rotation=45,
    horizontalalignment='right'
);
Read standardized merged data and remove columns like time, offer_id
# Read clean data
# Standardized (scaled) version of the merged dataset, for PCA/clustering.
starbucks_std = pd.read_csv('starbucks_standarlized.csv',index_col='customer_id')
# Print
starbucks_std.head()
# Keep only customers who received an offer, mirroring the raw-data prep above.
starbucks_std_drop = starbucks_std[starbucks_std['offer_received'] == 1]
starbucks_std_final = starbucks_std_drop.drop([ 'Unnamed: 0', 'offer_id','time_viewed_received','time_completed_received','time_completed_viewed','email','offer_received','transaction'], axis = 1)
starbucks_std_final.dtypes
starbucks_std_final.shape
def fit_pca(df, n_components, fit_trans='fit'):
    '''
    Build a PCA model for "df" using sklearn.decomposition.PCA.

    Parameters
    ----------
    df: A pandas.DataFrame (or array-like) of numeric features.
    n_components: An int. Number of principal components to keep.
    fit_trans: 'fit' to return the fitted PCA instance; any other value
        returns the transformed data instead.

    Returns
    -------
    A fitted sklearn.decomposition.PCA instance when fit_trans == 'fit',
    otherwise the numpy array produced by fit_transform.
    '''
    model = PCA(n_components=n_components)
    if fit_trans == 'fit':
        return model.fit(df)
    return model.fit_transform(df)
def plot_scaled_variance(pca):
    '''
    Plot the fraction of variance explained by each principal component.

    Parameters
    ----------
    pca: A fitted sklearn.decomposition.PCA instance.

    Returns
    -------
    A matplotlib.Axes instance.
    '''
    fig, axes = plt.subplots(figsize=(10, 6))
    axes.plot(pca.explained_variance_ratio_, color='#00704A')
    axes.set_title('Fraction of Explained Variance')
    axes.set_xlabel('Dimension #')
    axes.set_ylabel('Explained Variance Ratio')
    return axes
def scree_plot(pca):
    '''
    Draw a scree plot for a fitted PCA model: bars show each component's
    explained-variance ratio, the line shows the cumulative total.

    INPUT: pca - a fitted sklearn.decomposition.PCA instance
    OUTPUT:
        None (draws on a new matplotlib figure)
    '''
    ratios = pca.explained_variance_ratio_
    positions = np.arange(len(ratios))

    plt.figure(figsize=(20, 6))
    ax = plt.subplot(111)
    ax.bar(positions, ratios, color=starbucks_palette)
    ax.plot(positions, np.cumsum(ratios), color='#00704A')
    ax.xaxis.set_tick_params(width=0)
    ax.yaxis.set_tick_params(width=2, length=12)
    ax.set_xlabel("Principal Component")
    ax.set_ylabel("Variance Explained (%)")
    plt.title('Explained Variance Per Principal Component')
# Map weights for the first principal component to corresponding feature names
# and then print the linked values, sorted by weight.
def plot_weight(df, pca, ith) :
    '''
    Plot the feature weights of the i-th principal component as a
    horizontal bar chart, sorted by weight.

    Input :
        df = original dataframe (supplies the feature names)
        pca = pca.fit(data)
        ith = int value, from 1 to len(pca.n_components)
    Return : axes of barplot
    '''
    pairs = list(zip(pca.components_[ith - 1], df.columns))
    pairs.sort()
    weights = [w for w, _ in pairs]
    features = [name for _, name in pairs]
    fig, ax = plt.subplots(figsize=(10, 20))
    ax = sns.barplot(weights, features,palette = starbucks_palette)
    return ax
We should use scaled data for dimension reduction and for clustering algorithms.
Clustering algorithms are most affected by the range of features since they use distances between data points to determine their similarity. Since the features have different scales, with income having a much larger magnitude than the rest, nearly all the weight is given to income. This will impact the performance of the machine learning algorithm and lead to bias towards one feature.
# PCA on the UNscaled data — kept only to demonstrate why scaling matters.
pca_fit_non_std = fit_pca(starbucks_final, starbucks_std_final.shape[1],'fit')
ax = plot_scaled_variance(pca_fit_non_std)
Income Feature explains 100% of the variance
# First-component weights of the unscaled PCA (dominated by income).
plot_weight(starbucks_final,pca_fit_non_std, 1)
Component 1 explains little over 30% Component 2 explains around 17%
# PCA on the standardized data, keeping all components for now.
pca_fit = fit_pca(starbucks_std_final, starbucks_std_final.shape[1],'fit')
ax = plot_scaled_variance(pca_fit)
# Bar chart of raw explained variance per component.
features = range(pca_fit.n_components_)
plt.bar(features, pca_fit.explained_variance_,color = '#00704A')
plt.xlabel('PCA feature')
plt.ylabel('variance')
plt.xticks(features)
plt.show()
7 components explain 90% of the explained variance. Since we set our threshold to be 90%, we will use 7 components for our dimension reduction.
#scree plot
scree_plot(pca_fit)
#PCA
# Find how many leading components are needed to reach 85% / 90% / 95%
# cumulative explained variance.
cumvals = np.cumsum(pca_fit.explained_variance_ratio_)
print("Number of Main PCA components that explained at least 85% variance : {}".format(np.where(cumvals >= 0.85)[0][0]+1))
print("Number of Main PCA components that explained at least 90% variance : {}".format(np.where(cumvals >= 0.90)[0][0]+1))
print("Number of Main PCA components that explained at least 95% variance : {}".format(np.where(cumvals >= 0.95)[0][0]+1))
Perform validation on the reduced dataset to check on the explained variance and the weights. Rewards,Difficulty,amount given higher weights and it is apt for our analysis since our focus is on the customers responsive to offers and high expenditure
# Refit PCA with only 7 components (the 90%-variance threshold chosen above).
pca_fit_reduced = fit_pca(starbucks_std_final, 7)
ax = plot_scaled_variance(pca_fit_reduced)
features = range(pca_fit_reduced.n_components_)
plt.bar(features, pca_fit_reduced.explained_variance_,color=starbucks_palette)
plt.xlabel('PCA feature')
plt.ylabel('variance')
plt.xticks(features)
plt.show()
#scree plot
scree_plot(pca_fit_reduced)
#PCA
cumvals = np.cumsum(pca_fit_reduced.explained_variance_ratio_)
print("Number of Main PCA components that explained at least 90% variance : {}".format(np.where(cumvals >= 0.90)[0][0]+1))
# NOTE(review): this transform uses 8 components although the analysis above
# selected 7 and the saved filename says "reduced_7" — confirm which count
# is intended; downstream column naming assumes 8 columns.
pca_transform = fit_pca(starbucks_std_final, 8,"transform")
plot_weight(starbucks_std_final,pca_fit_reduced, 1)
# Persist the reduced matrix for later clustering runs.
np.save('starbucks_reduced_7_offer_only.npy', pca_transform)
For this dataset, following models will be implemented and the results will be analyzed and compared to chose the better one.
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import sklearn
from sklearn.utils import check_random_state
from sklearn.cluster import KMeans
def cluster(array, random_state, n_clusters=4):
    '''
    Fits and predicts k-means clustering on "array".

    Parameters
    ----------
    array: A numpy array
    random_state: Random seed, e.g. check_random_state(0)
    n_clusters: The number of clusters. Default: 4

    Returns
    -------
    A tuple (sklearn.cluster.KMeans, np.ndarray) — the fitted model and
    the cluster label assigned to each row of "array".
    '''
    kmeans = KMeans(n_clusters = n_clusters, init = 'k-means++', max_iter = 45000,
                    n_init = 10, random_state = random_state)
    # Fit the array, then predict the clusters for the same data.
    model = kmeans.fit(array)
    clusters = kmeans.predict(array)
    # Removed: unused `cluster_centroid` local and commented-out debug prints;
    # centers remain available via model.cluster_centers_.
    return model, clusters
def plot_inertia(array, start=1, end=10):
    '''
    Plot k-means inertia (the "elbow method") for every cluster count
    from "start" to "end" inclusive.

    Parameters
    ----------
    array: A numpy array.
    start: An int. Default: 1
    end: An int. Default: 10

    Returns
    -------
    A matplotlib.Axes instance.
    '''
    ks = list(range(start, end + 1))
    inertias = []
    # Fit one k-means model per candidate k and record its inertia.
    for k in ks:
        km = KMeans(n_clusters=k, init='k-means++',
                    max_iter=4500, n_init=10, random_state=0)
        km.fit(array)
        inertias.append(km.inertia_)
    # Plot inertia as a function of the number of clusters.
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.set_title('The elbow method')
    ax.set_ylabel('Inertia')
    ax.set_xlabel('Number of clusters')
    plt.plot(ks, inertias, color='#00704A')
    return ax
def plot_pair(reduced, clusters):
    '''
    Visualize the pairwise distribution of the first four principal
    components, colored by cluster label, using seaborn.PairGrid.
    Diagonal plots are histograms; the off-diagonal plots are scatter plots.

    Parameters
    ----------
    reduced: A numpy array of PCA-reduced data.
    clusters: A numpy array of cluster labels, one per row of "reduced"
        (-1 denotes DBSCAN noise points).

    Returns
    -------
    A seaborn.axisgrid.PairGrid instance.
    '''
    df = pd.DataFrame(reduced)
    df['c'] = clusters
    columns = [0, 1, 2, 3]
    subset = columns + ['c']
    # Removed dead code: the original tested `clusters.any() == -1` (never
    # true, since any() is boolean) and assigned an unused `color`; PairGrid's
    # hue mapping already gives the noise label its own color.
    ax = sns.PairGrid(df[subset], vars=columns, hue='c')
    ax = ax.map_diag(plt.hist)
    ax = ax.map_offdiag(plt.scatter)
    return ax
from sklearn.metrics import silhouette_score
from tqdm import tqdm
def plot_silhouette(reduced, min_clusters=2, max_clusters=11):
    '''
    Compute and plot the average silhouette score of k-means clusterings
    for cluster counts from min_clusters (inclusive) to max_clusters
    (exclusive). (The original docstring was copy-pasted from plot_pair.)

    Parameters
    ----------
    reduced: A numpy array.
    min_clusters: Smallest number of clusters to try. Default: 2
    max_clusters: One past the largest number of clusters to try. Default: 11

    Returns
    -------
    A matplotlib.Axes instance.
    '''
    silh = []
    clusters = range(min_clusters, max_clusters)
    for n in tqdm(clusters):
        model = KMeans(n_clusters=n, random_state=42)
        preds = model.fit_predict(reduced)
        silhouette_avg = silhouette_score(reduced, preds)
        silh.append(silhouette_avg)
        print("For n_clusters = ", n, "The avg silhouette_score is :", silhouette_avg)
    fig, ax = plt.subplots(figsize=(10, 8))
    ax.plot(clusters, silh, marker="o", color='#00704A')
    ax.set_ylabel("Silhouette Score")  # typo "Silhoutte" fixed
    ax.set_xlabel('Number of clusters')
    # Return the axes as documented (the original returned None).
    return ax
Create Silhouette and Inertia Plot to identify the number of cluster to be considered for creating K means algorithm Silhouette Coefficient: Silhouette Coefficient or silhouette score is a metric used to calculate the goodness of a clustering technique. Its value ranges from -1 to 1. 1: Means clusters are well apart from each other and clearly distinguished. 0: Means clusters are indifferent, or we can say that the distance between clusters is not significant. -1: Means clusters are assigned in the wrong way.
The Silhouette Coefficient is calculated using the mean intra-cluster distance (a) and the mean nearest-cluster distance (b) for each sample. The Silhouette Coefficient for a sample is (b - a) / max(a, b). To clarify, b is the distance between a sample and the nearest cluster that the sample is not a part of. Note that Silhouette Coefficient is only defined if number of labels is 2 <= n_labels <= n_samples - 1.
In short, the average silhouette approach measures the quality of a clustering. That is, it determines how well each object lies within its cluster. A high average silhouette width indicates a good clustering. The average silhouette method computes the average silhouette of observations for different values of k. The optimal number of clusters k is the one that maximizes the average silhouette over a range of possible values for k.
# Average silhouette score for k = 2..10 on the PCA-reduced data.
plot_silhouette(pca_transform)
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np
# Per-cluster silhouette analysis, adapted from the scikit-learn example
# "Selecting the number of clusters with silhouette analysis on KMeans".
# NOTE(review): print(__doc__) is a leftover from the sklearn example script;
# outside that script it typically just prints None.
print(__doc__)
X=pca_transform
range_n_clusters = [2, 3, 4, 5, 6,7,8,9]
for n_clusters in range_n_clusters:
    # Create a subplot with 1 row and 2 columns
    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(18, 7)
    # The 1st subplot is the silhouette plot
    # The silhouette coefficient can range from -1, 1 but in this example all
    # lie within [-0.1, 1]
    ax1.set_xlim([-0.1, 1])
    # The (n_clusters+1)*10 is for inserting blank space between silhouette
    # plots of individual clusters, to demarcate them clearly.
    ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10])
    # Initialize the clusterer with n_clusters value and a random generator
    # seed of 10 for reproducibility.
    clusterer = KMeans(n_clusters=n_clusters, random_state=10)
    cluster_labels = clusterer.fit_predict(X)
    # The silhouette_score gives the average value for all the samples.
    # This gives a perspective into the density and separation of the formed
    # clusters
    silhouette_avg = silhouette_score(X, cluster_labels)
    print("For n_clusters =", n_clusters,
          "The average silhouette_score is :", silhouette_avg)
    # Compute the silhouette scores for each sample
    sample_silhouette_values = silhouette_samples(X, cluster_labels)
    y_lower = 10
    for i in range(n_clusters):
        # Aggregate the silhouette scores for samples belonging to
        # cluster i, and sort them
        ith_cluster_silhouette_values = \
            sample_silhouette_values[cluster_labels == i]
        ith_cluster_silhouette_values.sort()
        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i
        color = cm.nipy_spectral(float(i) / n_clusters)
        ax1.fill_betweenx(np.arange(y_lower, y_upper),
                          0, ith_cluster_silhouette_values,
                          facecolor=color, edgecolor=color, alpha=0.7)
        # Label the silhouette plots with their cluster numbers at the middle
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
        # Compute the new y_lower for next plot
        y_lower = y_upper + 10  # 10 for the 0 samples
    ax1.set_title("The silhouette plot for the various clusters.")
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")
    # The vertical line for average silhouette score of all the values
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")
    ax1.set_yticks([])  # Clear the yaxis labels / ticks
    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])
    # 2nd Plot showing the actual clusters formed
    colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
    ax2.scatter(X[:, 0], X[:, 1], marker='.', s=30, lw=0, alpha=0.7,
                c=colors, edgecolor='k')
    # Labeling the clusters
    centers = clusterer.cluster_centers_
    # Draw white circles at cluster centers
    ax2.scatter(centers[:, 0], centers[:, 1], marker='o',
                c="white", alpha=1, s=200, edgecolor='k')
    for i, c in enumerate(centers):
        ax2.scatter(c[0], c[1], marker='$%d$' % i, alpha=1,
                    s=50, edgecolor='k')
    ax2.set_title("The visualization of the clustered data.")
    ax2.set_xlabel("Feature space for the 1st feature")
    ax2.set_ylabel("Feature space for the 2nd feature")
    plt.suptitle(("Silhouette analysis for KMeans clustering on sample data "
                  "with n_clusters = %d" % n_clusters),
                 fontsize=14, fontweight='bold')
plt.show()
# Elbow (inertia) plot over k = 1..10 on the PCA-reduced data.
inertia = plot_inertia(pca_transform)
A low silhouette coefficient of 0.28 (towards 0) indicates the clusters are indifferent and overlapping, and the distance between clusters is not significant. The elbow diagram indicates the same, with no clear separation.
# Run k-means with 6 clusters (chosen from the inertia/silhouette analysis).
k_means_t, cluster_t = cluster(pca_transform, random_state=check_random_state(1), n_clusters=6)
# Attach the PCA components to the non-transformed feature frame for profiling.
k_means_plot_df = pd.concat([starbucks_final.reset_index(drop=True), pd.DataFrame(pca_transform)], axis=1)
# BUG FIX: the original assigned 8 names (with 'PCA_Component7' duplicated)
# to the last 7 columns, which raises a broadcast error. pca_transform was
# built with 8 components, so name the last 8 columns distinctly.
k_means_plot_df.columns.values[-8:] = ['PCA_Component1', 'PCA_Component2', 'PCA_Component3', 'PCA_Component4',
                                       'PCA_Component5', 'PCA_Component6', 'PCA_Component7', 'PCA_Component8']
k_means_plot_df['Cluster'] = cluster_t
# Cluster sizes (row counts per cluster).
k_means_plot_df.groupby('Cluster').count()['offer_completed']
Plot of PCA Component1 and Component2 - to visualize how the data is spread and separated. As we can see , we see lot of data points are spread all over and overlapping, which may explain the low Silhoutte Score.
# Scatter of the first two PCA components to eyeball cluster separability.
x_ax = k_means_plot_df['PCA_Component1']
y_ax = k_means_plot_df['PCA_Component2']
plt.figure(figsize = (12,10))
# Use keyword arguments: positional x/y for sns.scatterplot were deprecated
# and then removed in seaborn >= 0.12.
sns.scatterplot(x=x_ax, y=y_ax, color='#00704A')
plt.title('Scatter Plot of First Two PCA Components')
plt.show()
k_means_plot_df.head()
Plot the Clusters for the first 2 components of the reduced dataset and visualize how well the clusters are separated /distanced from each other.
#Visualising the clusters
# One scatter per cluster label over the first two PCA components, then the
# k-means centroids. The original repeated the same scatter call six times;
# a loop over (label, color) pairs produces the identical plot.
plt.figure(figsize = (12,10))
cluster_colors = ['red', 'blue', 'green', 'orange', 'purple', 'indigo']
for idx, col in enumerate(cluster_colors):
    plt.scatter(k_means_plot_df.loc[k_means_plot_df['Cluster'] == idx, ['PCA_Component1']],
                k_means_plot_df.loc[k_means_plot_df['Cluster'] == idx, ['PCA_Component2']],
                s = 75, c = col, label = 'Cluster %d' % (idx + 1))
#Plotting the centroids of the clusters
plt.scatter(k_means_t.cluster_centers_[:, 0],
            k_means_t.cluster_centers_[:, 1], s = 200,
            c = 'yellow', label = 'Centroids')
for i, c in enumerate(k_means_t.cluster_centers_):
    plt.scatter(c[0], c[1], marker='$%d$' % i, alpha=1,
                s=50, edgecolor='k')
plt.legend()
As expected , with our earlier observation of low silhouette coefficient , we can identify that clusters are not evenly separated and overlapping each other, with Cluster 0 densely populated and Cluster 4 scarcely populated . Also the high spending customers who are identified as outliers are grouped as one clusters(Cluster 4). We will do further analysis on the Cluster features and identify what group of Customers contributed in each Cluster.
from sklearn.datasets.samples_generator import make_blobs
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import DBSCAN
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn import metrics
#We can calculate the distance from each point to its closest neighbour using the NearestNeighbors.
#The point itself is included in n_neighbors.
neigh = NearestNeighbors(n_neighbors=5)
nbrs = neigh.fit(pca_transform)
#The kneighbors method returns two arrays,
#one which contains the distance to the closest n_neighbors points
#and the other which contains the index for each of those points.
distances, indices = nbrs.kneighbors(pca_transform)
#sort and plot results
plt.figure(figsize = (10,6))
distances = np.sort(distances, axis=0)
# NOTE(review): column 1 is the distance to the *nearest* non-self neighbour;
# the usual DBSCAN eps heuristic plots the k-th neighbour (distances[:, -1]) —
# confirm which was intended.
distances = distances[:,1]
plt.plot(distances)
Tried Varying combinations of eps and min_samples ranging from eps value of 0.4 to 1.3 and min sample of 2 to 100.
# DBSCAN trial 1: eps=0.90, min_samples=20 on the PCA-reduced data.
dbsc = DBSCAN(eps = 0.90, min_samples = 20).fit(pca_transform)
#Get the cluster labels
labels = dbsc.labels_
###print (labels)
#Identify the core and border points
core_samples = np.zeros_like(labels, dtype = bool)
core_samples[dbsc.core_sample_indices_] = True
# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)
print('Estimated number of clusters: %d' % n_clusters_)
print('Estimated number of noise points: %d' % n_noise_)
# The supervised metrics below need ground-truth labels `y`, which this
# dataset does not have — hence they stay commented out.
##print("Homogeneity: %0.3f" % metrics.homogeneity_score(y, labels))
#print("Completeness: %0.3f" % metrics.completeness_score(y, labels))
#print("V-measure: %0.3f" % metrics.v_measure_score(y, labels))
#print("Adjusted Rand Index: %0.3f"
#      % metrics.adjusted_rand_score(y, labels))
#print("Adjusted Mutual Information: %0.3f"
#      % metrics.adjusted_mutual_info_score(y, labels))
#print("Silhouette Coefficient: %0.3f"
#      % metrics.silhouette_score(X, labels))
# Plot result
import matplotlib.pyplot as plt
plt.figure(figsize = (20,15))
# Black removed and is used for noise instead.
unique_labels = set(labels)
colors = [plt.cm.Spectral(each)
          for each in np.linspace(0, 1, len(unique_labels))]
for k, col in zip(unique_labels, colors):
    if k == -1:
        # Black used for noise.
        col = [0, 0, 0, 1]
    class_member_mask = (labels == k)
    # Core points drawn large, border points small.
    xy = pca_transform[class_member_mask & core_samples]
    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
             markeredgecolor='k', markersize=14)
    xy = pca_transform[class_member_mask & ~core_samples]
    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
             markeredgecolor='k', markersize=6)
plt.title('Estimated number of clusters: %d' % n_clusters_)
plt.show()
# DBSCAN trial 2: eps=1.10, min_samples=20 (same pipeline as above, no plot).
dbsc = DBSCAN(eps = 1.10, min_samples = 20).fit(pca_transform)
#Get the cluster labels
labels = dbsc.labels_
###print (labels)
#Identify the core and border points
core_samples = np.zeros_like(labels, dtype = bool)
core_samples[dbsc.core_sample_indices_] = True
# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)
print('Estimated number of clusters: %d' % n_clusters_)
print('Estimated number of noise points: %d' % n_noise_)
# Supervised metrics stay commented: no ground-truth labels `y` exist here.
##print("Homogeneity: %0.3f" % metrics.homogeneity_score(y, labels))
#print("Completeness: %0.3f" % metrics.completeness_score(y, labels))
#print("V-measure: %0.3f" % metrics.v_measure_score(y, labels))
#print("Adjusted Rand Index: %0.3f"
#      % metrics.adjusted_rand_score(y, labels))
#print("Adjusted Mutual Information: %0.3f"
#      % metrics.adjusted_mutual_info_score(y, labels))
#print("Silhouette Coefficient: %0.3f"
#      % metrics.silhouette_score(X, labels))
eps = 1.00 and min_sample = 20 is the selected one for DBSCAN.
This combination of the values gave optimal number of clusters and with less noise. Otherwise combinations lead to either more clusters or more noise.
# Final DBSCAN run: eps=1.00, min_samples=20 (selected combination).
# TODO(review): this cell duplicates the eps=0.90 cell almost verbatim —
# a shared helper would remove the copy-paste.
dbsc = DBSCAN(eps = 1.00, min_samples = 20).fit(pca_transform)
#Get the cluster labels
labels = dbsc.labels_
###print (labels)
#Identify the core and border points
core_samples = np.zeros_like(labels, dtype = bool)
core_samples[dbsc.core_sample_indices_] = True
# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)
print('Estimated number of clusters: %d' % n_clusters_)
print('Estimated number of noise points: %d' % n_noise_)
# Supervised metrics stay commented: no ground-truth labels `y` exist here.
##print("Homogeneity: %0.3f" % metrics.homogeneity_score(y, labels))
#print("Completeness: %0.3f" % metrics.completeness_score(y, labels))
#print("V-measure: %0.3f" % metrics.v_measure_score(y, labels))
#print("Adjusted Rand Index: %0.3f"
#      % metrics.adjusted_rand_score(y, labels))
#print("Adjusted Mutual Information: %0.3f"
#      % metrics.adjusted_mutual_info_score(y, labels))
#print("Silhouette Coefficient: %0.3f"
#      % metrics.silhouette_score(X, labels))
# Plot result
import matplotlib.pyplot as plt
plt.figure(figsize = (20,15))
# Black removed and is used for noise instead.
unique_labels = set(labels)
colors = [plt.cm.Spectral(each)
          for each in np.linspace(0, 1, len(unique_labels))]
for k, col in zip(unique_labels, colors):
    if k == -1:
        # Black used for noise.
        col = [0, 0, 0, 1]
    class_member_mask = (labels == k)
    # Core points drawn large, border points small.
    xy = pca_transform[class_member_mask & core_samples]
    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
             markeredgecolor='k', markersize=14)
    xy = pca_transform[class_member_mask & ~core_samples]
    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
             markeredgecolor='k', markersize=6)
plt.title('Estimated number of clusters: %d' % n_clusters_)
plt.show()
DBSCAN algorithm captured high volume transactions/outliers as noise. Apart from that there is better segregation between the customer of different offer types and difficulty . Though there are too many clusters , better customer segregation based on the offer used, amount spent than K-means
# Pairwise view of the first four PCA components colored by DBSCAN labels.
pg = plot_pair(pca_transform, labels)
Update the DBSCAN Cluster results to the output Dataframe for cluster analysis
# Record DBSCAN labels alongside the k-means labels for comparison.
k_means_plot_df['DBSCAN_Cluster'] = dbsc.labels_
k_means_plot_df.groupby('DBSCAN_Cluster').count()['offer_completed']
from sklearn.cluster import AgglomerativeClustering
import scipy
from scipy.cluster.hierarchy import dendrogram, linkage
def dendo(data,link_mthd,getxlbl,px=12,py=10):
    '''
    Draw a hierarchical-clustering dendrogram for "data".

    Input:
        data: array-like observations to link.
        link_mthd: linkage method name (e.g. 'ward').
        getxlbl: DataFrame whose index supplies the leaf labels.
        px, py: figure width and height in inches.
    '''
    linked = linkage(data, method=link_mthd)
    leaf_labels = getxlbl.index.tolist()
    # Plot the dendrogram
    plt.figure(figsize=(px, py))
    dendrogram(linked,
               labels=leaf_labels,
               leaf_rotation=90.,
               leaf_font_size=8)
    plt.show()
Create a dendrogram to determine the number of clusters.
Caution: the dendrogram takes over 8 hours to run on this dataset.
#Create Dendrogram (WARNING: very slow on this dataset — see note above cell)
dendo(pca_transform,'ward',starbucks_final)
Set Threshold of 300 and determine the clusters - Number of Clusters=3
# Agglomerative clustering with 3 clusters (dendrogram threshold ~300).
# BUG FIX: the original bound this estimator to the name `cluster`, shadowing
# the cluster() helper function defined earlier in the file; renamed so the
# function stays callable afterwards.
agg_model = AgglomerativeClustering(n_clusters=3, affinity='euclidean', linkage='ward')
cluster_array = agg_model.fit_predict(pca_transform)
cluster_array
# Plot result
# First two PCA components colored by the 3-cluster agglomerative labels.
import matplotlib.pyplot as plt
plt.figure(figsize = (20,15))
# Black removed and is used for noise instead.
# (Agglomerative clustering never emits -1, so the noise branch is inert here.)
unique_labels = set(cluster_array)
colors = [plt.cm.Spectral(each)
          for each in np.linspace(0, 1, len(unique_labels))]
for k, col in zip(unique_labels, colors):
    if k == -1:
        # Black used for noise.
        col = [0, 0, 0, 1]
    class_member_mask = (cluster_array == k)
    xy = pca_transform[class_member_mask]
    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
             markeredgecolor='k', markersize=14)
plt.title('Hierarchical Agglomerative Clustering - Clusters 3')
plt.show()
Update the Cluster results to the output Dataframe for cluster analysis
# Record the 3-cluster hierarchical labels for comparison.
k_means_plot_df['Hier_Agg_Cluster'] = cluster_array
k_means_plot_df.groupby('Hier_Agg_Cluster').count()['offer_completed']
Set a threshold of 250 and determine the clusters - Number of Clusters = 6
# Agglomerative clustering with 6 clusters (dendrogram threshold ~250).
cluster_6 = AgglomerativeClustering(n_clusters=6, affinity='euclidean', linkage='ward')
cluster_array_6 = cluster_6.fit_predict(pca_transform)
cluster_array_6
k_means_plot_df['Hier_Agg_Cluster_6'] = cluster_array_6
k_means_plot_df.groupby('Hier_Agg_Cluster_6').count()['offer_completed']
# Plot result
# First two PCA components colored by the 6-cluster agglomerative labels.
import matplotlib.pyplot as plt
plt.figure(figsize = (20,15))
# Black removed and is used for noise instead.
# (Agglomerative clustering never emits -1, so the noise branch is inert here.)
unique_labels = set(cluster_array_6)
colors = [plt.cm.Spectral(each)
          for each in np.linspace(0, 1, len(unique_labels))]
for k, col in zip(unique_labels, colors):
    if k == -1:
        # Black used for noise.
        col = [0, 0, 0, 1]
    class_member_mask = (cluster_array_6 == k)
    xy = pca_transform[class_member_mask]
    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
             markeredgecolor='k', markersize=14)
plt.title('Hierarchical Agglomerative Clustering - Clusters 6')
plt.show()
Hierarchical clustering with 6 clusters gave similar results to K-means in terms of data separation, with Cluster 0 denser than the others.
Modelled using the standardized data and plotted using the cluster labels for all 3 models (K-Means, DBSCAN, Hierarchical Clustering).
""" Visualise Cluster with TSNE"""
from sklearn.manifold import TSNE
tsne = TSNE(random_state=42,learning_rate=100).fit_transform(pca_transform)
tsne_df = pd.DataFrame(tsne, columns=['xs', 'ys'])
tsne_df['cluster'] = ['cluster_' + str(i) for i in cluster_t]
#plot tsne
plt.figure(figsize = (20,15))
#sns.scatterplot('xs','ys', hue='cluster', data=tsne_df, ax=ax, hue_order=['cluster_' + str(i) for i in range(0,7)])
ax.set_title("K means Cluster with TSNE")
sns.lmplot(x='xs', y='ys', data=tsne_df, hue='cluster', fit_reg=False,hue_order=['cluster_' + str(i) for i in range(0,6)])
# t-SNE colored by DBSCAN labels.
# NOTE(review): in the three cells below, `ax.set_title` targets whatever
# stale axes `ax` last referenced — the titles do not land on the lmplot
# figures; lmplot also creates its own figure, leaving the plt.figure calls
# producing empty figures. Confirm and title via the returned FacetGrid.
tsne_df = pd.DataFrame(tsne, columns=['xs', 'ys'])
tsne_df['dbcluster'] = ['cluster_' + str(i) for i in dbsc.labels_]
#plot tsne
plt.figure(figsize = (20,15))
ax.set_title("DBSCAN Cluster with TSNE")
sns.lmplot(x='xs', y='ys', data=tsne_df, hue='dbcluster', fit_reg=False,hue_order=['cluster_' + str(i) for i in range(0,11)])
# t-SNE colored by 3-cluster hierarchical labels.
tsne_df = pd.DataFrame(tsne, columns=['xs', 'ys'])
tsne_df['haggcluster'] = ['cluster_' + str(i) for i in cluster_array]
#plot tsne
plt.figure(figsize = (20,15))
ax.set_title("Agglomerative Hierarchical Cluster with TSNE")
sns.lmplot(x='xs', y='ys', data=tsne_df, hue='haggcluster', fit_reg=False,hue_order=['cluster_' + str(i) for i in range(0,3)])
# t-SNE colored by 6-cluster hierarchical labels.
tsne_df = pd.DataFrame(tsne, columns=['xs', 'ys'])
tsne_df['haggcluster_6'] = ['cluster_' + str(i) for i in cluster_array_6]
#plot tsne
plt.figure(figsize = (20,15))
ax.set_title("Agglomerative Hierarchical Cluster with TSNE")
sns.lmplot(x='xs', y='ys', data=tsne_df, hue='haggcluster_6', fit_reg=False,hue_order=['cluster_' + str(i) for i in range(0,6)])
t-SNE visualization showed a much clearer separation for DBSCAN than for the other two algorithms, which reflects what we saw in the 2-D plots of the first two components.
tsne_df.head()
This model is considered under the assumption that K-Means and DBSCAN are not able to capture the underlying distribution, so it was recommended to give the Gaussian Mixture Model (GMM), fitted with the EM algorithm, a try. This clustering method could give better results; in that case, it assumes that the input variables are a mix of Gaussian (bell-shaped) distributions. We are giving it a shot here to see if this model performs better than the rest.
A Gaussian mixture model is a probabilistic model that assumes all the data points are generated from a mixture of a finite number of Gaussian distributions with unknown parameters. One can think of mixture models as generalizing k-means clustering to incorporate information about the covariance structure of the data as well as the centers of the latent Gaussians.
Scikit-learn implements different classes to estimate Gaussian mixture models, that correspond to different estimation strategies, detailed below.
!pip install gmm-mml
from gmm_mml import GmmMml
##gmm=GmmMml(plots=True)
##gmm.fit(pca_transform)
from matplotlib import style
from scipy.stats import multivariate_normal
from sklearn.mixture import GaussianMixture
def plot_gmm(gmm, X, label=True, ax=None):
    """Fit `gmm` on X, scatter the points, and overlay one ellipse per
    mixture component with opacity proportional to its weight.

    NOTE(review): relies on a `draw_ellipse` helper defined elsewhere in the
    notebook; assumes X's first two columns are the plotting axes — confirm
    with the caller.
    """
    ax = ax or plt.gca()
    # fit() mutates `gmm` in place; labels are the per-point component ids
    labels = gmm.fit(X).predict(X)
    if label:
        # colour each point by its predicted component
        ax.scatter(X[:, 0], X[:, 1], c=labels, s=40, cmap='viridis', zorder=2)
    else:
        ax.scatter(X[:, 0], X[:, 1], s=40, zorder=2)
    # scale so the heaviest component's ellipse gets alpha 0.2
    w_factor = 0.2 / gmm.weights_.max()
    for pos, covar, w in zip(gmm.means_, gmm.covariances_, gmm.weights_):
        draw_ellipse(pos, covar, alpha=w * w_factor)
    plt.title("GMM with %d components"%len(gmm.means_), fontsize=(20))
    plt.xlabel("U.A.")
    plt.ylabel("U.A.")
def SelBest(arr: list, X: int) -> list:
    '''
    Return the X smallest values of `arr` (ascending).

    Accepts any sequence (list or ndarray); the original applied numpy fancy
    indexing directly to `arr`, which raises a TypeError for plain Python
    lists despite the `list` annotation.
    '''
    arr = np.asarray(arr)
    idx = np.argsort(arr)[:X]
    return arr[idx]
# Sweep candidate cluster counts and score each with BIC, averaging the best
# runs to damp GMM's sensitivity to random initialisation.
# NOTE(review): no random_state is set, so results vary run to run — confirm
# whether reproducibility is required.
n_clusters = np.arange(2, 12)
bics = []
bics_err = []
iterations = 20
for n in n_clusters:
    tmp_bic = []
    for _ in range(iterations):
        # n_components passed by keyword: positional estimator arguments are
        # deprecated and rejected by modern scikit-learn.
        gmm = GaussianMixture(n_components=n, n_init=2).fit(pca_transform)
        tmp_bic.append(gmm.bic(pca_transform))
    # mean of the best 20% of runs; std of all runs serves as the error bar
    val = np.mean(SelBest(np.array(tmp_bic), int(iterations / 5)))
    err = np.std(tmp_bic)
    bics.append(val)
    bics_err.append(err)
# BIC curve with per-point error bars for the first ten cluster counts.
fig = plt.figure(figsize=(15, 10))
ks, scores, errs = n_clusters[:10], bics[:10], bics_err[:10]
plt.errorbar(ks, scores, yerr=errs, label='BIC', color=starbucks_palette[0])
plt.title("BIC Scores", fontsize=20)
plt.xticks(ks)
plt.xlabel("N. of clusters")
plt.ylabel("Score")
plt.legend()
n_clusters
# Gradient of the BIC curve: the elbow shows where adding clusters stops
# paying off.
fig = plt.figure(figsize=(15, 10))
bic_grad = np.gradient(bics)
plt.errorbar(n_clusters[:10], bic_grad[:10], yerr=bics_err[:10],
             label='BIC', color=starbucks_palette[0])
plt.title("Gradient of BIC Scores", fontsize=20)
plt.xticks(n_clusters[:10])
plt.xlabel("N. of clusters")
plt.ylabel("grad(BIC)")
plt.legend()
Selecting the number of clusters as 5, since the change in BIC scores beyond 5 is less significant and flattens out compared to the change from 4 to 5.
X = pca_transform
# Fit a 5-component Gaussian Mixture Model on the PCA components.
GMM = GaussianMixture(n_components=5).fit(X)
# Check if the model has converged
print('Converged:', GMM.converged_)
means = GMM.means_
covariances = GMM.covariances_
# Predict with the already-fitted model. (The original additionally built and
# fit a second, unrelated GaussianMixture via fit_predict whose labels were
# immediately discarded; that redundant fit is removed.)
GMM_predict = GMM.predict(X)
print(GMM_predict)
np.unique(GMM_predict, return_counts=True)
# Plot result
import matplotlib.pyplot as plt
plt.figure(figsize=(20, 15))
# One colour per component, sampled evenly from the Spectral colormap;
# -1 (noise) would be drawn black, though GMM labels never include it.
unique_labels = set(GMM_predict)
colors = [plt.cm.Spectral(frac)
          for frac in np.linspace(0, 1, len(unique_labels))]
for label, rgba in zip(unique_labels, colors):
    if label == -1:
        rgba = [0, 0, 0, 1]
    members = pca_transform[GMM_predict == label]
    plt.plot(members[:, 0], members[:, 1], 'o',
             markerfacecolor=tuple(rgba), markeredgecolor='k', markersize=14)
plt.title('Gaussian Mixture')
plt.show()
Also tried to build the model with 6 clusters.
X = pca_transform
# Fit a 6-component Gaussian Mixture Model on the PCA components.
GMM = GaussianMixture(n_components=6).fit(X)
# Check if the model has converged
print('Converged:', GMM.converged_)
means = GMM.means_
covariances = GMM.covariances_
# Predict with the already-fitted model. (The original additionally built and
# fit a second, unrelated GaussianMixture via fit_predict whose labels were
# immediately discarded; that redundant fit is removed.)
GMM_predict = GMM.predict(X)
print(GMM_predict)
np.unique(GMM_predict, return_counts=True)
# Plot result
import matplotlib.pyplot as plt
plt.figure(figsize=(20, 15))
# One colour per component, sampled evenly from the Spectral colormap;
# -1 (noise) would be drawn black, though GMM labels never include it.
unique_labels = set(GMM_predict)
colors = [plt.cm.Spectral(frac)
          for frac in np.linspace(0, 1, len(unique_labels))]
for label, rgba in zip(unique_labels, colors):
    if label == -1:
        rgba = [0, 0, 0, 1]
    members = pca_transform[GMM_predict == label]
    plt.plot(members[:, 0], members[:, 1], 'o',
             markerfacecolor=tuple(rgba), markeredgecolor='k', markersize=14)
plt.title('Gaussian Mixture')
plt.show()
The results of the GMM model closely resemble K-Means, with not much improvement over the other models. More understanding of GMM and other optimization techniques is needed to further the analysis and to better interpret the model. This will be taken as future work, continuing the study on this dataset.
Group age and income into bins for easier grouping and plotting.
# Bin age into decade groups.
age_labels = ['20-', '20-29', '30-39', '40-49', '50-59', '60-69', '70+']
k_means_plot_df['age_group'] = pd.cut(k_means_plot_df.age,
                                      bins=[0, 20, 30, 40, 50, 60, 70, 120],
                                      right=False, labels=age_labels)
# Create income groups. The original reused the `ages` variable for income
# labels, which was misleading; distinct names are used here.
income_labels = ['40K-', '40-59K', '60-79K', '80-99K', '100K+']
k_means_plot_df['income_group'] = pd.cut(k_means_plot_df.income,
                                         bins=[0, 40000, 60000, 80000, 100000, 300000],
                                         right=False, labels=income_labels)
# Preserve the original final binding of `ages` in case a later cell reads it.
ages = income_labels
k_means_plot_df.to_csv("Starbucks_Cluster_Output.csv")
k_means_plot_df.columns
# Sum of the key metrics for every (age_group, income_group) cell.
summary_spec = {'amount': ['sum'],
                'male': ['sum'],
                'offer_completed': ['sum'],
                'total_reward': ['sum'],
                'bogo': ['sum'],
                'discount': ['sum'],
                }
k_means_plot_df.groupby(['age_group', 'income_group']).agg(summary_spec)
# Widen the default figure for the box plots that follow.
plt.rcParams['figure.figsize'] = [15, 10]
ax = sns.boxplot(x="age_group", y="amount", hue="Cluster", data=k_means_plot_df)
ax.set_title("Spending by age group against each Cluster")
Cluster 4 captures all the high volume transactions across all age groups and is clearly separated from the rest of the plot. We need to understand more about the customer purchase patterns — whether these are bulk corporate or family orders, and how frequently these customers make high volume purchases. Clearly, customers aged 70+ contributed more high volume transactions than the rest.
This will help create specific campaigns for this focus group to attract more bulk transactions.
sns.boxplot(x = "income_group", y = "amount", hue = "Cluster", data = k_means_plot_df)
plt.title("Spending by income group against each Cluster")
Customers in the income group of 80-99K contributed more high volume transactions. Unsurprisingly, the high income group spent the most.
sns.boxplot(x = "bogo", y = "amount", hue = "Cluster", data = k_means_plot_df[k_means_plot_df['bogo'] == 1])
plt.title("Spending in each Cluster and number of bogo offers")
sns.boxplot(x = "discount", y = "amount", hue = "Cluster", data = k_means_plot_df[k_means_plot_df['discount'] == 1])
plt.title("Spending in each Cluster and number of Discount offers")
A higher mean transaction amount was spent for both BOGO and Discount offers in Cluster 4.
sns.boxplot(x = "male", y = "amount", hue = "Cluster", data = k_means_plot_df[k_means_plot_df['discount'] == 1])
plt.title("Gender Based Spending in each Cluster ")
There are no clear differences between male and female spending on discounts in Clusters 0, 3 and 5. Females have more high volume transactions than males.
The plots below visualize how each feature is segregated between the clusters, which will help to draw some solid conclusions.
# Per-cluster distribution of each feature. `x` is passed by keyword:
# positional data arguments to countplot were deprecated in seaborn 0.12 and
# raise an error in later releases.
f, ax = plt.subplots(figsize=(10, 5))
sns.countplot(x='Cluster', hue='male', data=k_means_plot_df, ax=ax)
ax.set_title("Gender Distribution")
f, ax = plt.subplots(figsize=(10, 5))
sns.countplot(x='Cluster', hue='bogo', data=k_means_plot_df, ax=ax)
ax.set_title("Bogo Distribution")
f, ax = plt.subplots(figsize=(10, 5))
sns.countplot(x='Cluster', hue='discount', data=k_means_plot_df, ax=ax)
ax.set_title("Discount Distribution")
f, ax = plt.subplots(figsize=(10, 5))
sns.countplot(x='Cluster', hue='informational', data=k_means_plot_df, ax=ax)
ax.set_title("Informational Distribution")
f, ax = plt.subplots(figsize=(10, 5))
sns.countplot(x='Cluster', hue='age_group', data=k_means_plot_df, ax=ax)
ax.set_title("Age_Group Distribution")
f, ax = plt.subplots(figsize=(10, 5))
sns.countplot(x='Cluster', hue='income_group', data=k_means_plot_df, ax=ax)
ax.set_title("Income_Group Distribution")
Analyzed the spread of BOGO and discount offers and their difficulty across clusters.
# Double brackets select a sub-frame of columns; indexing a GroupBy with a
# bare tuple ('bogo','discount') was deprecated and removed in modern pandas.
starbucks_final.groupby('difficulty')[['bogo', 'discount']].sum()
# Offer counts per (difficulty, cluster) cell.
k_means_plot_df.groupby(['difficulty', 'Cluster']).agg({'bogo': ['sum'],
                                                        'discount': ['sum']})
# Per-cluster offer difficulty / completion / view distributions. `x` is
# passed by keyword (positional use was deprecated in seaborn 0.12).
f, ax = plt.subplots(figsize=(10, 5))
sns.countplot(x='Cluster', hue='difficulty', data=k_means_plot_df, ax=ax)
ax.set_title("Difficulty Distribution")
f, ax = plt.subplots(figsize=(10, 5))
sns.countplot(x='Cluster', hue='offer_completed', data=k_means_plot_df, ax=ax)
ax.set_title("Offer Completed Distribution")
f, ax = plt.subplots(figsize=(10, 5))
sns.countplot(x='Cluster', hue='offer_viewed', data=k_means_plot_df, ax=ax)
ax.set_title("Offer Viewed Distribution")
from IPython.display import display

# Aggregation spec shared by the overall / female / male breakdowns; the
# original repeated this dict verbatim three times.
_CLUSTER_AGG = {'amount': ['sum', 'mean'],
                'male': ['sum', 'count'],
                'bogo': ['sum', 'count'],
                'age': ['mean'],
                'discount': ['sum', 'count'],
                'income': ['mean'],
                'offer_completed': ['sum', 'count'],
                'offer_viewed': ['sum', 'count']}


def _agg_by(df, key):
    """Aggregate customer metrics per cluster label `key`, rounded to 2 dp."""
    return df.groupby(key).agg(_CLUSTER_AGG).round(2)


cluster_split_df = _agg_by(k_means_plot_df, 'Cluster')
cluster_female_df = _agg_by(k_means_plot_df[k_means_plot_df['male'] == 0], 'Cluster')
cluster_male_df = _agg_by(k_means_plot_df[k_means_plot_df['male'] == 1], 'Cluster')

display('Total Aggregated Data by Cluster :', cluster_split_df)
display('Female Aggregated Data by Cluster :', cluster_female_df)
display('Male Aggregated Data by Cluster :', cluster_male_df)
figure, axes = plt.subplots(3, 2)
# `subplots` takes a bool; the original passed the string 'True', which only
# worked because any non-empty string is truthy.
k_means_plot_df.groupby('Cluster').agg({'amount': ['sum'],
                                        'bogo': ['sum'],
                                        'discount': ['sum'],
                                        }).plot.pie(figsize=(15, 10), subplots=True,
                                                    ax=axes[0, 0], autopct='%1.1f%%',
                                                    explode=(0.02, 0, 0, 0, 0.1, 0),
                                                    title="Total Expenditure by Cluster")
plt.rcParams['figure.figsize'] = [15, 10]
# Cluster 2 is excluded, so only five wedges remain (five-element explode).
k_means_plot_df[k_means_plot_df['Cluster'] != 2].groupby('Cluster').agg(
    {'offer_completed': ['sum']}
).plot.pie(subplots=True, autopct='%1.1f%%', explode=(0.02, 0, 0, 0.2, 0),
           colors=["#00704A", "#eac784", "#362415", 'gold', "#604c4c", 'black'])
Clusters 0 and 3 are our ideal customers, with around 90% of the total transactions completed using offers.
# Same per-cluster metric aggregation, keyed on the DBSCAN labels; the agg
# spec was copy-pasted three times in the original and is deduplicated here.
_DB_AGG = {'amount': ['sum', 'mean'],
           'male': ['sum', 'count'],
           'bogo': ['sum', 'count'],
           'age': ['mean'],
           'discount': ['sum', 'count'],
           'income': ['mean'],
           'offer_completed': ['sum', 'count'],
           'offer_viewed': ['sum', 'count']}


def _db_agg(df):
    """Aggregate customer metrics per DBSCAN cluster, rounded to 2 dp."""
    return df.groupby('DBSCAN_Cluster').agg(_DB_AGG).round(2)


dbcluster_split_df = _db_agg(k_means_plot_df)
dbcluster_female_df = _db_agg(k_means_plot_df[k_means_plot_df['male'] == 0])
dbcluster_male_df = _db_agg(k_means_plot_df[k_means_plot_df['male'] == 1])

display('Female Aggregated Data by Cluster :', dbcluster_female_df)
display('Male Aggregated Data by Cluster :', dbcluster_male_df)
display('Total Aggregated Data by Cluster :', dbcluster_split_df)
# Offer difficulty distribution per DBSCAN cluster; `x` passed by keyword
# (positional use was deprecated in seaborn 0.12).
f, ax = plt.subplots(figsize=(10, 5))
sns.countplot(x='DBSCAN_Cluster', hue='difficulty', data=k_means_plot_df, ax=ax)
ax.set_title("Difficulty Distribution")
## Filter and Plot Zero dollar transaction amount Clusters
non_active_dbscan_df = k_means_plot_df[k_means_plot_df['DBSCAN_Cluster'].isin([2, 5, 7, 8])]
non_active_dbscan_df.head()
non_active_dbscan_df.offer_completed.value_counts()

plt.rcParams['figure.figsize'] = [15, 10]
# Offers viewed vs. total offers for the zero-spend clusters; `male` is a
# per-row column, so its count doubles as the total offer count.
non_active_dbscan_groupdf = non_active_dbscan_df.groupby('DBSCAN_Cluster').agg(
    {'offer_viewed': ['sum'], 'male': ['count']}).rename(columns={'male': 'Total_Offers'})

# Capture the axes the bar plot actually draws on: the original annotated a
# stale `ax` from an earlier figure, annotated every bar twice, and kept an
# unused `r` list — all fixed here.
ax = non_active_dbscan_groupdf.T.plot(kind="bar", stacked=True, grid=True,
                                      color=["#00704A", "#eac784", "#362415", 'gold',
                                             "#604c4c", 'black', 'orange', 'yellow',
                                             'purple', 'red', 'violet', 'indigo'])
for p in ax.patches:
    ax.annotate(np.round(p.get_height(), decimals=2),
                (p.get_x() + p.get_width() / 2., p.get_height()),
                ha='center', va='center', xytext=(0, 10), textcoords='offset points')
plt.xticks(range(0, 2), ["Offer_Viewed", "Offer_Completed"])
plt.xlabel("Offers Viewed vs Offers Received")
plt.ylabel("Total Customers")
plt.show()
non_active_dbscan_groupdf.offer_viewed
Customer offer activity per DBSCAN cluster is visualized below.
plt.rcParams['figure.figsize'] = [15, 10]
# Share of completed offers per DBSCAN cluster. `subplots` takes a bool; the
# original passed the truthy string 'True'.
k_means_plot_df.groupby('DBSCAN_Cluster').agg({'offer_completed': ['sum']}).plot.pie(
    subplots=True, autopct='%1.1f%%',
    colors=["#00704A", "#eac784", "#362415", 'gold', "#604c4c", 'black',
            'orange', 'yellow', 'purple', 'red', 'violet', 'indigo'])
figure, axes = plt.subplots(3, 2)
k_means_plot_df.groupby('DBSCAN_Cluster').agg({'amount': ['sum'],
                                               'bogo': ['sum'],
                                               'discount': ['sum'],
                                               }).plot.pie(figsize=(15, 10), subplots=True,
                                                           ax=axes[0, 0], autopct='%1.1f%%',
                                                           title="Total Expenditure by Cluster")
# NOTE(review): bare attribute access — this returns the plot accessor without
# drawing anything; probably meant `dbcluster_split_df.plot(...)` — confirm.
dbcluster_split_df.plot
Cluster 9 — customers responsive to BOGO (difficulty 10) but not to discounts.
High volume purchases are captured as noise.
# Per-cluster metric aggregation keyed on the 3-cluster agglomerative labels;
# the identical agg spec was copy-pasted three times in the original.
_H3_AGG = {'amount': ['sum', 'mean'],
           'male': ['sum', 'count'],
           'bogo': ['sum', 'count'],
           'age': ['mean'],
           'discount': ['sum', 'count'],
           'income': ['mean'],
           'offer_completed': ['sum', 'count'],
           'offer_viewed': ['sum', 'count']}


def _h3_agg(df):
    """Aggregate customer metrics per hierarchical cluster, rounded to 2 dp."""
    return df.groupby('Hier_Agg_Cluster').agg(_H3_AGG).round(2)


h3cluster_split_df = _h3_agg(k_means_plot_df)
h3cluster_female_df = _h3_agg(k_means_plot_df[k_means_plot_df['male'] == 0])
h3cluster_male_df = _h3_agg(k_means_plot_df[k_means_plot_df['male'] == 1])

display('Total Aggregated Data by Cluster :', h3cluster_split_df)
display('Female Aggregated Data by Cluster :', h3cluster_female_df)
display('Male Aggregated Data by Cluster :', h3cluster_male_df)

# One pie per aggregated column, laid out on a 4x5 grid.
h3cluster_split_df.plot.pie(subplots=True, layout=(4, 5))
plt.tight_layout()
plt.show()
# Per-cluster metric aggregation keyed on the 6-cluster agglomerative labels;
# the identical agg spec was copy-pasted three times in the original.
_H6_AGG = {'amount': ['sum', 'mean'],
           'male': ['sum', 'count'],
           'bogo': ['sum', 'count'],
           'age': ['mean'],
           'discount': ['sum', 'count'],
           'income': ['mean'],
           'offer_completed': ['sum', 'count'],
           'offer_viewed': ['sum', 'count']}


def _h6_agg(df):
    """Aggregate customer metrics per 6-way hierarchical cluster, 2 dp."""
    return df.groupby('Hier_Agg_Cluster_6').agg(_H6_AGG).round(2)


h6cluster_split_df = _h6_agg(k_means_plot_df)
h6cluster_female_df = _h6_agg(k_means_plot_df[k_means_plot_df['male'] == 0])
h6cluster_male_df = _h6_agg(k_means_plot_df[k_means_plot_df['male'] == 1])

display('Total Aggregated Data by Cluster :', h6cluster_split_df)
display('Female Aggregated Data by Cluster :', h6cluster_female_df)
display('Male Aggregated Data by Cluster :', h6cluster_male_df)

# One pie per aggregated column, laid out on a 4x5 grid.
h6cluster_split_df.plot.pie(subplots=True, layout=(4, 5))
plt.tight_layout()
plt.show()
Conclusion: With the comprehensive unsupervised models, comparison and analysis, we prefer DBSCAN over K-Means and hierarchical clustering due to its much clearer segregation of customer profiles. Even though high-profile customers are captured as noise, it would be interesting to analyze their profiles further; their historical transactions and purchase reasons would help us understand more and conclude whether they are true noise.
Starbucks should continue/review its marketing strategies for responsive customers, and come up with better marketing strategies for customers who are not as receptive to the offers.